<!DOCTYPE html>
<html lang="en-us">
  <head>

    <meta charset="utf-8">
<title>Finding Distinct Counts | Elasticsearch: The Definitive Guide [2.x] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch: The Definitive Guide [2.x]">
<link rel="up" href="_approximate_aggregations.html" title="Approximate Aggregations">
<link rel="prev" href="_approximate_aggregations.html" title="Approximate Aggregations">
<link rel="next" href="percentiles.html" title="Calculating Percentiles">
<meta name="DC.type" content="Learn/Docs/Legacy/Elasticsearch/Definitive Guide/2.x">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="2.x">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager -->
    <!-- Initialise the data layer; the noscript frame is the fallback used when JavaScript is disabled. -->
    <script>dataLayer = [];</script><noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-58RLH5" title="Google Tag Manager" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <script>
      // Push the load-start timestamp onto the data layer, then inject gtm.js
      // asynchronously ahead of the first <script> element in the document.
      // The container is always fetched over HTTPS, regardless of the page scheme.
      (function(w,d,s,l,i){
        w[l]=w[l]||[];
        w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});
        // Only append the "&l=" parameter when a non-default data layer name is used.
        var f=d.getElementsByTagName(s)[0], j=d.createElement(s), dl=l!='dataLayer'?'&l='+l:'';
        j.async=true;
        j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;
        f.parentNode.insertBefore(j,f);
      })(window,document,'script','dataLayer','GTM-58RLH5');
    </script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      // Reuse the existing global dataLayer array, or create it if it is absent.
      window.dataLayer = window.dataLayer || [];
      // gtag() forwards its raw `arguments` object onto dataLayer (not a copied array).
      function gtag(){dataLayer.push(arguments);}
      // Queue a 'js' entry carrying the current time, then a 'config' entry for the property ID.
      gtag('js', new Date());
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <script type="text/javascript">
      // Minified vendor loader. Constructor g(e, h, f, g):
      //   e - sampling value compared against 100, h - mode flag ("v" or "r"),
      //   f - name of the cookie used to persist state, g - URL of the script to inject.
      (function(){var g=function(e,h,f,g){
      // get(a): return the value of cookie `a` from document.cookie, or null when it is not set.
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      // set(a, c): write cookie a=c with path "/" and an expiry 6048E5 ms (7 days) from now.
      this.set=function(a,c){var b="",b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toGMTString();document.cookie=a+"="+c+b+"; path=/; "};
      // check(): decide whether to load the script. State is the cookie value split on ":"
      // as [mode, value, counter]. With no cookie and e == 100 it returns true immediately;
      // with no cookie and e != 100 it seeds the cookie first. A stored value of 100 returns true.
      // Mode "v" returns false; mode "r" increments the counter, rewrites the cookie and
      // returns true only when counter % floor(100 / value) was 0.
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      // go(): when check() passes, append a script element pointing at `g` to document.body (if present).
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      // start(): run go() on window load, via addEventListener or the attachEvent fallback.
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      // Instantiate with value 100 and mode "r" and start it; any exception is deliberately swallowed.
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<p>
  <strong>WARNING</strong>: The 2.x versions of Elasticsearch have passed their
  <a href="https://www.elastic.co/support/eol">EOL dates</a>. If you are running
  a 2.x version, we strongly advise you to upgrade.
</p>
<p>
  This documentation is no longer maintained and may be removed. For the latest
  information, see the <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html">current
  Elasticsearch documentation</a>.
</p>
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch: The Definitive Guide [2.x]</a></span>
»
<span class="breadcrumb-link"><a href="aggregations.html">Aggregations</a></span>
»
<span class="breadcrumb-link"><a href="_approximate_aggregations.html">Approximate Aggregations</a></span>
»
<span class="breadcrumb-node">Finding Distinct Counts</span>
</div>
<div class="navheader">
<span class="prev">
<a href="_approximate_aggregations.html">« Approximate Aggregations</a>
</span>
<span class="next">
<a href="percentiles.html">Calculating Percentiles »</a>
</span>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="cardinality"></a>Finding Distinct Counts<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/300_Aggregations/60_cardinality.asciidoc">edit</a>
</h2>
</div></div></div>
<p>The first approximate aggregation provided by Elasticsearch is the <code class="literal">cardinality</code>
metric.  This provides the cardinality of a field, also called a <em>distinct</em> or
<em>unique</em> count.  You may be familiar with the SQL version:</p>
<div class="pre_wrapper lang-sql">
<pre class="programlisting prettyprint lang-sql">SELECT COUNT(DISTINCT color)
FROM cars</pre>
</div>
<p>Distinct counts are a common operation, and answer many fundamental business questions:</p>
<div class="ulist itemizedlist">
<ul class="itemizedlist">
<li class="listitem">
How many unique visitors have come to my website?
</li>
<li class="listitem">
How many unique cars have we sold?
</li>
<li class="listitem">
How many distinct users purchased a product each month?
</li>
</ul>
</div>
<p>We can use the <code class="literal">cardinality</code> metric to determine the number of car colors being
sold at our dealership:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /cars/transactions/_search
{
    "size" : 0,
    "aggs" : {
        "distinct_colors" : {
            "cardinality" : {
              "field" : "color"
            }
        }
    }
}</pre>
</div>
<div class="sense_widget" data-snippet="snippets/300_Aggregations/60_cardinality.json"></div>
<p>This returns a minimal response showing that we have sold three different-colored
cars:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">...
"aggregations": {
  "distinct_colors": {
     "value": 3
  }
}
...</pre>
</div>
<p>We can make our example more useful:  how many colors were sold each month?  For
that metric, we just nest the <code class="literal">cardinality</code> metric under a <code class="literal">date_histogram</code>:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /cars/transactions/_search
{
  "size" : 0,
  "aggs" : {
      "months" : {
        "date_histogram": {
          "field": "sold",
          "interval": "month"
        },
        "aggs": {
          "distinct_colors" : {
              "cardinality" : {
                "field" : "color"
              }
          }
        }
      }
  }
}</pre>
</div>
<div class="sense_widget" data-snippet="snippets/300_Aggregations/60_cardinality.json"></div>
<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_understanding_the_trade_offs"></a>Understanding the Trade-offs<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/300_Aggregations/60_cardinality.asciidoc">edit</a>
</h3>
</div></div></div>
<p>As mentioned at the top of this chapter, the <code class="literal">cardinality</code> metric is an approximate
algorithm.  It is based on the <a href="http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf" class="ulink" target="_top">HyperLogLog++</a> (HLL) algorithm.  HLL works by
hashing your input and using the bits from the hash to make probabilistic estimations
on the cardinality.</p>
<p>You don’t need to understand the technical details (although if you’re interested,
the paper is a great read!), but you should be aware of the <em>properties</em> of the
algorithm:</p>
<div class="ulist itemizedlist">
<ul class="itemizedlist">
<li class="listitem">
Configurable precision, which controls memory usage (more precise
== more memory).
</li>
<li class="listitem">
Excellent accuracy on low-cardinality sets.
</li>
<li class="listitem">
Fixed memory usage. Whether there are thousands or billions of unique
values, memory usage depends on only the configured precision.
</li>
</ul>
</div>
<p>To configure the precision, you must specify the <code class="literal">precision_threshold</code> parameter.
This threshold defines the point under which cardinalities are expected to be very
close to accurate. Consider this example:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /cars/transactions/_search
{
    "size" : 0,
    "aggs" : {
        "distinct_colors" : {
            "cardinality" : {
              "field" : "color",
              "precision_threshold" : 100 <a id="CO204-1"></a><i class="conum" data-value="1"></i>
            }
        }
    }
}</pre>
</div>
<div class="sense_widget" data-snippet="snippets/300_Aggregations/60_cardinality.json"></div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO204-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p><code class="literal">precision_threshold</code> accepts a number from 0–40,000.  Larger values
are treated as equivalent to 40,000.</p>
</td>
</tr>
</table>
</div>
<p>This example will ensure that counts for fields with 100 or fewer distinct values will be extremely accurate.
Although not guaranteed by the algorithm, if a cardinality is under the threshold,
it is almost always 100% accurate.  Cardinalities above this will begin to trade
accuracy for memory savings, and a little error will creep into the metric.</p>
<p>For a given threshold, the HLL data structure will use about
<code class="literal">precision_threshold * 8</code> bytes of memory.  So you must balance how much memory
you are willing to sacrifice for additional accuracy.</p>
<p>Practically speaking, a threshold of <code class="literal">100</code> maintains an error under 5% even when
counting millions of unique values.</p>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_optimizing_for_speed"></a>Optimizing for Speed<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/300_Aggregations/60_cardinality.asciidoc">edit</a>
</h3>
</div></div></div>
<p>If you want a distinct count, you <em>usually</em> want to query your entire dataset
(or nearly all of it).  Any operation on all your data needs to execute quickly,
for obvious reasons. HyperLogLog is very fast already—​it simply
hashes your data and does some bit-twiddling.</p>
<p>But if speed is important to you, we can optimize it a little bit further.
Since HLL simply needs the hash of the field, we can precompute that hash at
index time.  When the query executes, we can skip the hash computation and load
the value directly out of fielddata.</p>
<div class="note admon">
<div class="icon"></div>
<div class="admon_content">
<p>Precomputing hashes is useful only on very large and/or high-cardinality
fields, where the cost of calculating the hash at query time is non-negligible.</p>
<p>However, numeric fields hash very quickly, and storing the original numeric value often
requires the same (or less) memory as storing its hash. This is also true on low-cardinality string
fields; there are internal optimizations that guarantee that hashes are
calculated only once per unique value.</p>
<p>Basically, precomputing hashes is not guaranteed to make all fields faster — only those that have high cardinality and/or large strings.  And remember,
precomputing simply shifts the cost to index time.  You still pay the price;
you just choose <em>when</em> to pay it.</p>
</div>
</div>
<p>To do this, we need to add a new multifield to our data.  We’ll delete our index,
add a new mapping that includes the hashed field, and then reindex:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">DELETE /cars/

PUT /cars/
{
  "mappings": {
    "transactions": {
      "properties": {
        "color": {
          "type": "string",
          "fields": {
            "hash": {
              "type": "murmur3" <a id="CO205-1"></a><i class="conum" data-value="1"></i>
            }
          }
        }
      }
    }
  }
}

POST /cars/transactions/_bulk
{ "index": {}}
{ "price" : 10000, "color" : "red", "make" : "honda", "sold" : "2014-10-28" }
{ "index": {}}
{ "price" : 20000, "color" : "red", "make" : "honda", "sold" : "2014-11-05" }
{ "index": {}}
{ "price" : 30000, "color" : "green", "make" : "ford", "sold" : "2014-05-18" }
{ "index": {}}
{ "price" : 15000, "color" : "blue", "make" : "toyota", "sold" : "2014-07-02" }
{ "index": {}}
{ "price" : 12000, "color" : "green", "make" : "toyota", "sold" : "2014-08-19" }
{ "index": {}}
{ "price" : 20000, "color" : "red", "make" : "honda", "sold" : "2014-11-05" }
{ "index": {}}
{ "price" : 80000, "color" : "red", "make" : "bmw", "sold" : "2014-01-01" }
{ "index": {}}
{ "price" : 25000, "color" : "blue", "make" : "ford", "sold" : "2014-02-12" }</pre>
</div>
<div class="sense_widget" data-snippet="snippets/300_Aggregations/60_cardinality.json"></div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO205-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p>This multifield is of type <code class="literal">murmur3</code>, which is a hashing function.</p>
</td>
</tr>
</table>
</div>
<p>Now when we run an aggregation, we use the <code class="literal">color.hash</code> field instead of the
<code class="literal">color</code> field:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">GET /cars/transactions/_search
{
    "size" : 0,
    "aggs" : {
        "distinct_colors" : {
            "cardinality" : {
              "field" : "color.hash" <a id="CO206-1"></a><i class="conum" data-value="1"></i>
            }
        }
    }
}</pre>
</div>
<div class="sense_widget" data-snippet="snippets/300_Aggregations/60_cardinality.json"></div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO206-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p>Notice that we specify the hashed multifield, rather than the original.</p>
</td>
</tr>
</table>
</div>
<p>Now the <code class="literal">cardinality</code> metric will load the values (the precomputed hashes)
from <code class="literal">"color.hash"</code> and use those in place of dynamically hashing the original
value.</p>
<p>The savings per document is small, but if hashing each field adds 10 nanoseconds and your aggregation touches 100 million documents, that adds 1 second per
query.  If you find yourself using <code class="literal">cardinality</code> across many documents,
perform some profiling to see if precomputing hashes makes sense for your
deployment.</p>
</div>

</div>
<div class="navfooter">
<span class="prev">
<a href="_approximate_aggregations.html">« Approximate Aggregations</a>
</span>
<span class="next">
<a href="percentiles.html">Calculating Percentiles »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<script type="text/javascript">
  // Define an empty global state object on window.
  // NOTE(review): presumably read by /guide/static/docs.js loaded above; confirm.
  window.initial_state = {}</script>
  </body>
</html>
